library(ggplot2)
library(tidyverse)
library(dplyr)
library(Seurat)
library(SingleCellExperiment)
library(scater)
library(SCpubr)
setwd("~/Projects/HumanThymusProject/")
source("~/Projects/HumanThymusProject/scripts/colors_universal.R")
The Human Thymus Atlas was downloaded as an h5ad file
from cellxgene
(the version containing 255,901 cells).
# Read the h5ad file downloaded from cellxgene and print a summary of
# the resulting object (dimensions, assays, metadata fields)
sce_human_cellgene <- zellkonverter::readH5AD(
  file = "~/Projects/HumanThymusProject/data_github/park_dataset/c6e08ab6-ab3b-41dc-8058-8e6442e081ec.h5ad"
)
print(sce_human_cellgene)
## class: SingleCellExperiment
## dim: 32839 255901
## metadata(5): cell_type_ontology_term_id_colors citation
## schema_reference schema_version title
## assays(1): X
## rownames(32839): ENSG00000000003 ENSG00000000005 ... ENSG00000283118
## ENSG00000283125
## rowData names(5): feature_is_filtered feature_name feature_reference
## feature_biotype feature_length
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
## FCAImmP7179369-AAACCTGAGCCTATGT ...
## Human_colon_16S7985397-TTTGTCAAGCTGAACG
## Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(30): assay_ontology_term_id cell_type_ontology_term_id
## ... development_stage observation_joinid
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out metadata: per-cell annotations (colData) of the cellxgene object
colData(sce_human_cellgene)
## DataFrame with 255901 rows and 30 columns
## assay_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT EFO:0009899
## FCAImmP7179369-AAACCTGAGCCTATGT EFO:0009899
## FCAImmP7179369-AAACCTGAGTCGCCGT EFO:0009899
## FCAImmP7179369-AAACCTGCAGCATGAG EFO:0009899
## FCAImmP7179369-AAACCTGGTCTCCCTA EFO:0009899
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC EFO:0011025
## Human_colon_16S7985397-TTTGGTTCATGGTAGG EFO:0011025
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA EFO:0011025
## Human_colon_16S7985397-TTTGTCAAGCTGAACG EFO:0011025
## Human_colon_16S7985397-TTTGTCAGTATTAGCC EFO:0011025
## cell_type_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT CL:0000625
## FCAImmP7179369-AAACCTGAGCCTATGT CL:0000624
## FCAImmP7179369-AAACCTGAGTCGCCGT CL:0000809
## FCAImmP7179369-AAACCTGCAGCATGAG CL:0000625
## FCAImmP7179369-AAACCTGGTCTCCCTA CL:0000915
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC CL:0000809
## Human_colon_16S7985397-TTTGGTTCATGGTAGG CL:0000809
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA CL:0000809
## Human_colon_16S7985397-TTTGTCAAGCTGAACG CL:0000809
## Human_colon_16S7985397-TTTGTCAGTATTAGCC CL:0000235
## development_stage_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT HsapDv:0000053
## FCAImmP7179369-AAACCTGAGCCTATGT HsapDv:0000053
## FCAImmP7179369-AAACCTGAGTCGCCGT HsapDv:0000053
## FCAImmP7179369-AAACCTGCAGCATGAG HsapDv:0000053
## FCAImmP7179369-AAACCTGGTCTCCCTA HsapDv:0000053
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC HsapDv:0000047
## Human_colon_16S7985397-TTTGGTTCATGGTAGG HsapDv:0000047
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA HsapDv:0000047
## Human_colon_16S7985397-TTTGTCAAGCTGAACG HsapDv:0000047
## Human_colon_16S7985397-TTTGTCAGTATTAGCC HsapDv:0000047
## disease_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT PATO:0000461
## FCAImmP7179369-AAACCTGAGCCTATGT PATO:0000461
## FCAImmP7179369-AAACCTGAGTCGCCGT PATO:0000461
## FCAImmP7179369-AAACCTGCAGCATGAG PATO:0000461
## FCAImmP7179369-AAACCTGGTCTCCCTA PATO:0000461
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC PATO:0000461
## Human_colon_16S7985397-TTTGGTTCATGGTAGG PATO:0000461
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA PATO:0000461
## Human_colon_16S7985397-TTTGTCAAGCTGAACG PATO:0000461
## Human_colon_16S7985397-TTTGTCAGTATTAGCC PATO:0000461
## self_reported_ethnicity_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT unknown
## FCAImmP7179369-AAACCTGAGCCTATGT unknown
## FCAImmP7179369-AAACCTGAGTCGCCGT unknown
## FCAImmP7179369-AAACCTGCAGCATGAG unknown
## FCAImmP7179369-AAACCTGGTCTCCCTA unknown
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC unknown
## Human_colon_16S7985397-TTTGGTTCATGGTAGG unknown
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA unknown
## Human_colon_16S7985397-TTTGTCAAGCTGAACG unknown
## Human_colon_16S7985397-TTTGTCAGTATTAGCC unknown
## is_primary_data
## <logical>
## FCAImmP7179369-AAACCTGAGCCCAATT TRUE
## FCAImmP7179369-AAACCTGAGCCTATGT TRUE
## FCAImmP7179369-AAACCTGAGTCGCCGT TRUE
## FCAImmP7179369-AAACCTGCAGCATGAG TRUE
## FCAImmP7179369-AAACCTGGTCTCCCTA TRUE
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC TRUE
## Human_colon_16S7985397-TTTGGTTCATGGTAGG TRUE
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA TRUE
## Human_colon_16S7985397-TTTGTCAAGCTGAACG TRUE
## Human_colon_16S7985397-TTTGTCAGTATTAGCC TRUE
## organism_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT NCBITaxon:9606
## FCAImmP7179369-AAACCTGAGCCTATGT NCBITaxon:9606
## FCAImmP7179369-AAACCTGAGTCGCCGT NCBITaxon:9606
## FCAImmP7179369-AAACCTGCAGCATGAG NCBITaxon:9606
## FCAImmP7179369-AAACCTGGTCTCCCTA NCBITaxon:9606
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC NCBITaxon:9606
## Human_colon_16S7985397-TTTGGTTCATGGTAGG NCBITaxon:9606
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA NCBITaxon:9606
## Human_colon_16S7985397-TTTGTCAAGCTGAACG NCBITaxon:9606
## Human_colon_16S7985397-TTTGTCAGTATTAGCC NCBITaxon:9606
## sex_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT PATO:0000384
## FCAImmP7179369-AAACCTGAGCCTATGT PATO:0000384
## FCAImmP7179369-AAACCTGAGTCGCCGT PATO:0000384
## FCAImmP7179369-AAACCTGCAGCATGAG PATO:0000384
## FCAImmP7179369-AAACCTGGTCTCCCTA PATO:0000384
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC PATO:0000384
## Human_colon_16S7985397-TTTGGTTCATGGTAGG PATO:0000384
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA PATO:0000384
## Human_colon_16S7985397-TTTGTCAAGCTGAACG PATO:0000384
## Human_colon_16S7985397-TTTGTCAGTATTAGCC PATO:0000384
## tissue_ontology_term_id
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT UBERON:0002370
## FCAImmP7179369-AAACCTGAGCCTATGT UBERON:0002370
## FCAImmP7179369-AAACCTGAGTCGCCGT UBERON:0002370
## FCAImmP7179369-AAACCTGCAGCATGAG UBERON:0002370
## FCAImmP7179369-AAACCTGGTCTCCCTA UBERON:0002370
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC UBERON:0002370
## Human_colon_16S7985397-TTTGGTTCATGGTAGG UBERON:0002370
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA UBERON:0002370
## Human_colon_16S7985397-TTTGTCAAGCTGAACG UBERON:0002370
## Human_colon_16S7985397-TTTGTCAGTATTAGCC UBERON:0002370
## Sample n_counts n_genes
## <factor> <numeric> <numeric>
## FCAImmP7179369-AAACCTGAGCCCAATT F21_TH_45P 8738 1898
## FCAImmP7179369-AAACCTGAGCCTATGT F21_TH_45P 3627 1210
## FCAImmP7179369-AAACCTGAGTCGCCGT F21_TH_45P 14187 3153
## FCAImmP7179369-AAACCTGCAGCATGAG F21_TH_45P 12309 2387
## FCAImmP7179369-AAACCTGGTCTCCCTA F21_TH_45P 9128 2439
## ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC F74_TH_TOT_5GEX_2 16151 4158
## Human_colon_16S7985397-TTTGGTTCATGGTAGG F74_TH_TOT_5GEX_2 5315 1929
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA F74_TH_TOT_5GEX_2 5386 2084
## Human_colon_16S7985397-TTTGTCAAGCTGAACG F74_TH_TOT_5GEX_2 5096 1735
## Human_colon_16S7985397-TTTGTCAGTATTAGCC F74_TH_TOT_5GEX_2 3335 1187
## donor_id sort method
## <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT F21 45P 3GEX
## FCAImmP7179369-AAACCTGAGCCTATGT F21 45P 3GEX
## FCAImmP7179369-AAACCTGAGTCGCCGT F21 45P 3GEX
## FCAImmP7179369-AAACCTGCAGCATGAG F21 45P 3GEX
## FCAImmP7179369-AAACCTGGTCTCCCTA F21 45P 3GEX
## ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC F74 TOT 5GEX
## Human_colon_16S7985397-TTTGGTTCATGGTAGG F74 TOT 5GEX
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA F74 TOT 5GEX
## Human_colon_16S7985397-TTTGTCAAGCTGAACG F74 TOT 5GEX
## Human_colon_16S7985397-TTTGTCAGTATTAGCC F74 TOT 5GEX
## file mito
## <factor> <numeric>
## FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369 0.0215152
## FCAImmP7179369-AAACCTGAGCCTATGT FCAImmP7179369 0.0308795
## FCAImmP7179369-AAACCTGAGTCGCCGT FCAImmP7179369 0.0225559
## FCAImmP7179369-AAACCTGCAGCATGAG FCAImmP7179369 0.0279470
## FCAImmP7179369-AAACCTGGTCTCCCTA FCAImmP7179369 0.0256354
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC Human_colon_16S7985397 0.0191319
## Human_colon_16S7985397-TTTGGTTCATGGTAGG Human_colon_16S7985397 0.0180621
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA Human_colon_16S7985397 0.0298923
## Human_colon_16S7985397-TTTGTCAAGCTGAACG Human_colon_16S7985397 0.0141287
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Human_colon_16S7985397 0.0083958
## doublet_scores predicted_doublets
## <numeric> <logical>
## FCAImmP7179369-AAACCTGAGCCCAATT 0.2093023 FALSE
## FCAImmP7179369-AAACCTGAGCCTATGT 0.1118421 FALSE
## FCAImmP7179369-AAACCTGAGTCGCCGT 0.0433071 FALSE
## FCAImmP7179369-AAACCTGCAGCATGAG 0.1118421 FALSE
## FCAImmP7179369-AAACCTGGTCTCCCTA 0.1610169 FALSE
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 0.4117647 FALSE
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 0.0497738 FALSE
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 0.0556439 FALSE
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 0.0556439 FALSE
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 0.1176471 FALSE
## suspension_type tissue_type
## <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT cell tissue
## FCAImmP7179369-AAACCTGAGCCTATGT cell tissue
## FCAImmP7179369-AAACCTGAGTCGCCGT cell tissue
## FCAImmP7179369-AAACCTGCAGCATGAG cell tissue
## FCAImmP7179369-AAACCTGGTCTCCCTA cell tissue
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC cell tissue
## Human_colon_16S7985397-TTTGGTTCATGGTAGG cell tissue
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA cell tissue
## Human_colon_16S7985397-TTTGTCAAGCTGAACG cell tissue
## Human_colon_16S7985397-TTTGTCAGTATTAGCC cell tissue
## cell_type
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT CD8-positive, alpha-beta T cell
## FCAImmP7179369-AAACCTGAGCCTATGT CD4-positive, alpha-beta T cell
## FCAImmP7179369-AAACCTGAGTCGCCGT double-positive, alpha-beta thymocyte
## FCAImmP7179369-AAACCTGCAGCATGAG CD8-positive, alpha-beta T cell
## FCAImmP7179369-AAACCTGGTCTCCCTA CD8-alpha-alpha-positive, alpha-beta intraepithelial T cell
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGGTTCATGGTAGG double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGTCAAGCTGAACG double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGTCAGTATTAGCC macrophage
## assay disease organism
## <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT 10x 3' v2 normal Homo sapiens
## FCAImmP7179369-AAACCTGAGCCTATGT 10x 3' v2 normal Homo sapiens
## FCAImmP7179369-AAACCTGAGTCGCCGT 10x 3' v2 normal Homo sapiens
## FCAImmP7179369-AAACCTGCAGCATGAG 10x 3' v2 normal Homo sapiens
## FCAImmP7179369-AAACCTGGTCTCCCTA 10x 3' v2 normal Homo sapiens
## ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 10x 5' v1 normal Homo sapiens
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 10x 5' v1 normal Homo sapiens
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 10x 5' v1 normal Homo sapiens
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 10x 5' v1 normal Homo sapiens
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 10x 5' v1 normal Homo sapiens
## sex tissue
## <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT male thymus
## FCAImmP7179369-AAACCTGAGCCTATGT male thymus
## FCAImmP7179369-AAACCTGAGTCGCCGT male thymus
## FCAImmP7179369-AAACCTGCAGCATGAG male thymus
## FCAImmP7179369-AAACCTGGTCTCCCTA male thymus
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC male thymus
## Human_colon_16S7985397-TTTGGTTCATGGTAGG male thymus
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA male thymus
## Human_colon_16S7985397-TTTGTCAAGCTGAACG male thymus
## Human_colon_16S7985397-TTTGTCAGTATTAGCC male thymus
## self_reported_ethnicity
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT unknown
## FCAImmP7179369-AAACCTGAGCCTATGT unknown
## FCAImmP7179369-AAACCTGAGTCGCCGT unknown
## FCAImmP7179369-AAACCTGCAGCATGAG unknown
## FCAImmP7179369-AAACCTGGTCTCCCTA unknown
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC unknown
## Human_colon_16S7985397-TTTGGTTCATGGTAGG unknown
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA unknown
## Human_colon_16S7985397-TTTGTCAAGCTGAACG unknown
## Human_colon_16S7985397-TTTGTCAGTATTAGCC unknown
## development_stage
## <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT 16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGAGCCTATGT 16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGAGTCGCCGT 16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGCAGCATGAG 16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGGTCTCCCTA 16th week post-fertilization human stage
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 10th week post-fertilization human stage
## observation_joinid
## <character>
## FCAImmP7179369-AAACCTGAGCCCAATT X+W(sH{JAl
## FCAImmP7179369-AAACCTGAGCCTATGT xsX~{(Sp*z
## FCAImmP7179369-AAACCTGAGTCGCCGT <6<|6YaXP>
## FCAImmP7179369-AAACCTGCAGCATGAG YP@l>cc;QS
## FCAImmP7179369-AAACCTGGTCTCCCTA nM;U|cg{S^
## ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC `K_xYCtC9f
## Human_colon_16S7985397-TTTGGTTCATGGTAGG r8Nu6Wu^p#
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA Th}*MaCPmt
## Human_colon_16S7985397-TTTGTCAAGCTGAACG O-ymHp#;{~
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 2JdjrhlIRC
# Plot the UMAP embedding stored in the object ("X_umap"), coloured by
# the cellxgene cell type annotation
scater::plotReducedDim(
  object = sce_human_cellgene,
  dimred = "X_umap",
  colour_by = "cell_type"
)
# Inspect the per-gene annotation (rowData) to see how genes are named
rowData(sce_human_cellgene)
## DataFrame with 32839 rows and 5 columns
## feature_is_filtered feature_name feature_reference
## <logical> <factor> <factor>
## ENSG00000000003 FALSE TSPAN6 NCBITaxon:9606
## ENSG00000000005 FALSE TNMD NCBITaxon:9606
## ENSG00000000419 FALSE DPM1 NCBITaxon:9606
## ENSG00000000457 FALSE SCYL3 NCBITaxon:9606
## ENSG00000000460 FALSE C1orf112 NCBITaxon:9606
## ... ... ... ...
## ENSG00000283096 FALSE ENSG00000283096.1 NCBITaxon:9606
## ENSG00000283103 FALSE ENSG00000283103.5 NCBITaxon:9606
## ENSG00000283117 FALSE MGC4859 NCBITaxon:9606
## ENSG00000283118 FALSE ENSG00000283118.1 NCBITaxon:9606
## ENSG00000283125 FALSE ENSG00000283125.1 NCBITaxon:9606
## feature_biotype feature_length
## <factor> <factor>
## ENSG00000000003 gene 4530
## ENSG00000000005 gene 1476
## ENSG00000000419 gene 9276
## ENSG00000000457 gene 6883
## ENSG00000000460 gene 5970
## ... ... ...
## ENSG00000283096 gene 1259
## ENSG00000283103 gene 4585
## ENSG00000283117 gene 3118
## ENSG00000283118 gene 644
## ENSG00000283125 gene 547
# count how many gene symbols (feature_name) occur more than once
table(duplicated(rowData(sce_human_cellgene)$feature_name), useNA="ifany") # none of the gene symbols are duplicated (ideal :)
##
## FALSE
## 32839
The cell annotation doesn’t correspond to Figure 1 of the Park et
al. paper. We are looking for the different annotation levels, such as
the ones on the cellatlas.io
website. We downloaded the h5ad file from cellatlas.io;
let’s import it.
# Read the h5ad file downloaded from cellatlas.io and print a summary of
# the resulting object
sce_human_cellatlas <- zellkonverter::readH5AD(
  file = "~/Projects/HumanThymusProject/data_github/park_dataset/HTA08_v01_A05_Science_human_fig1.h5ad"
)
print(sce_human_cellatlas)
## class: SingleCellExperiment
## dim: 33694 255901
## metadata(0):
## assays(1): X
## rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
## rowData names(0):
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
## FCAImmP7179369-AAACCTGAGCCTATGT ...
## Human_colon_16S7985397-TTTGTCAAGCTGAACG
## Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(16): Anno_level_1 Anno_level_2 ... Gender Source
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out metadata: this file carries the Anno_level_* annotation columns
colData(sce_human_cellatlas) # much better!
## DataFrame with 255901 rows and 16 columns
## Anno_level_1 Anno_level_2 Anno_level_3
## <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT T SP T_naive
## FCAImmP7179369-AAACCTGAGCCTATGT T SP T_naive
## FCAImmP7179369-AAACCTGAGTCGCCGT T DP DP
## FCAImmP7179369-AAACCTGCAGCATGAG T SP T_naive
## FCAImmP7179369-AAACCTGGTCTCCCTA T SP CD8αα(I)
## ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC T DP DP
## Human_colon_16S7985397-TTTGGTTCATGGTAGG T DP DP
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA T DP DP
## Human_colon_16S7985397-TTTGTCAAGCTGAACG T DP DP
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Myeloid Mac/Mono Mac
## Anno_level_4 Anno_level_5
## <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT CD8+T CD8+T
## FCAImmP7179369-AAACCTGAGCCTATGT CD4+T CD4+T
## FCAImmP7179369-AAACCTGAGTCGCCGT DP DP(P)
## FCAImmP7179369-AAACCTGCAGCATGAG CD8+T CD8+T
## FCAImmP7179369-AAACCTGGTCTCCCTA CD8αα(I) CD8αα(I)
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC DP DP(P)
## Human_colon_16S7985397-TTTGGTTCATGGTAGG DP DP(Q)
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA DP DP(P)
## Human_colon_16S7985397-TTTGTCAAGCTGAACG DP DP(Q)
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Mac Mac
## Anno_level_fig1 Sample
## <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT CD8+T F21_TH_45P
## FCAImmP7179369-AAACCTGAGCCTATGT CD4+T F21_TH_45P
## FCAImmP7179369-AAACCTGAGTCGCCGT DP F21_TH_45P
## FCAImmP7179369-AAACCTGCAGCATGAG CD8+T F21_TH_45P
## FCAImmP7179369-AAACCTGGTCTCCCTA CD8αα F21_TH_45P
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC DP F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGGTTCATGGTAGG DP F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA DP F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGTCAAGCTGAACG DP F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Mac F74_TH_TOT_5GEX_2
## donor organ sort method
## <factor> <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT F21 TH 45P 3GEX
## FCAImmP7179369-AAACCTGAGCCTATGT F21 TH 45P 3GEX
## FCAImmP7179369-AAACCTGAGTCGCCGT F21 TH 45P 3GEX
## FCAImmP7179369-AAACCTGCAGCATGAG F21 TH 45P 3GEX
## FCAImmP7179369-AAACCTGGTCTCCCTA F21 TH 45P 3GEX
## ... ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC F74 TH TOT 5GEX
## Human_colon_16S7985397-TTTGGTTCATGGTAGG F74 TH TOT 5GEX
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA F74 TH TOT 5GEX
## Human_colon_16S7985397-TTTGTCAAGCTGAACG F74 TH TOT 5GEX
## Human_colon_16S7985397-TTTGTCAGTATTAGCC F74 TH TOT 5GEX
## file Anno_stage
## <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369 CD8+T_middle
## FCAImmP7179369-AAACCTGAGCCTATGT FCAImmP7179369 CD4+T_middle
## FCAImmP7179369-AAACCTGAGTCGCCGT FCAImmP7179369 DP_middle
## FCAImmP7179369-AAACCTGCAGCATGAG FCAImmP7179369 CD8+T_middle
## FCAImmP7179369-AAACCTGGTCTCCCTA FCAImmP7179369 CD8αα_middle
## ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC Human_colon_16S7985397 DP_early
## Human_colon_16S7985397-TTTGGTTCATGGTAGG Human_colon_16S7985397 DP_early
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA Human_colon_16S7985397 DP_early
## Human_colon_16S7985397-TTTGTCAAGCTGAACG Human_colon_16S7985397 DP_early
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Human_colon_16S7985397 Mac_early
## Age Gender Source
## <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT 16w Male HDBR
## FCAImmP7179369-AAACCTGAGCCTATGT 16w Male HDBR
## FCAImmP7179369-AAACCTGAGTCGCCGT 16w Male HDBR
## FCAImmP7179369-AAACCTGCAGCATGAG 16w Male HDBR
## FCAImmP7179369-AAACCTGGTCTCCCTA 16w Male HDBR
## ... ... ... ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 10w Male HDBR
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 10w Male HDBR
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 10w Male HDBR
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 10w Male HDBR
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 10w Male HDBR
# Plot the UMAP embedding, coloured by the Figure 1 annotation level
scater::plotReducedDim(
  object = sce_human_cellatlas,
  dimred = "X_umap",
  colour_by = "Anno_level_fig1"
)
# check out gene names
# rowData(sce_human_cellatlas) # rowData is empty
# Gene symbols are only available as rownames in this object, and some of
# them are duplicated (not ideal...)
table(duplicated(rownames(sce_human_cellatlas)), useNA = "ifany")
##
## FALSE TRUE
## 33660 34
In summary:
- the h5ad file from cellxgene
contains more information on feature names (with unique gene
symbols);
- the h5ad file from cellatlas.io
contains more/better cell metadata information (with correct clustering
annotation).

We will combine all we need into a new
SingleCellExperiment object, and then convert it into a
Seurat object.
First, let’s create a new SingleCellExperiment
object.
# check that the cell IDs are the same from both sources
# This is enforced rather than only tabulated: the merged object built
# further down pairs the expression matrix of one source with the colData
# of the other purely by position, so any difference in cell IDs or in
# their order would attach annotations to the wrong cells. identical()
# also fails on a length mismatch, where an elementwise `==` would recycle.
stopifnot(
  identical(colnames(sce_human_cellgene), colnames(sce_human_cellatlas))
)
table(colnames(sce_human_cellgene) == colnames(sce_human_cellatlas), useNA="ifany")
##
## TRUE
## 255901
# table(rownames(colData(sce_human_cellgene)) == rownames(colData(sce_human_cellatlas)), useNA="ifany")
# check that umap coordinates are the same from both sources
# The coordinates are floating-point numbers, so the enforced check uses
# all.equal() (numeric tolerance) instead of exact `==`; attributes such as
# dimnames are ignored because only the coordinate values matter here.
# The exact elementwise table is still printed for inspection.
stopifnot(isTRUE(all.equal(
  reducedDim(sce_human_cellgene, "X_umap"),
  reducedDim(sce_human_cellatlas, "X_umap"),
  check.attributes = FALSE
)))
table(reducedDim(sce_human_cellgene, "X_umap") == reducedDim(sce_human_cellatlas, "X_umap"), useNA="ifany")
##
## TRUE
## 511802
# Assemble the combined human object:
#   - expression matrix, gene annotation and UMAP come from the cellxgene file
#   - cell metadata comes from the cellatlas.io file
# NOTE(review): the first assay of the cellxgene object (named "X") is
# stored here under the name "counts" -- confirm that it holds raw counts.
sce_human_clean <- SingleCellExperiment(
  assays = list(counts = assay(sce_human_cellgene)),
  rowData = rowData(sce_human_cellgene),
  colData = colData(sce_human_cellatlas),
  reducedDims = list(umap = reducedDim(sce_human_cellgene, "X_umap"))
)
print(sce_human_clean)
## class: SingleCellExperiment
## dim: 32839 255901
## metadata(0):
## assays(1): counts
## rownames(32839): ENSG00000000003 ENSG00000000005 ... ENSG00000283118
## ENSG00000283125
## rowData names(5): feature_is_filtered feature_name feature_reference
## feature_biotype feature_length
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
## FCAImmP7179369-AAACCTGAGCCTATGT ...
## Human_colon_16S7985397-TTTGTCAAGCTGAACG
## Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(16): Anno_level_1 Anno_level_2 ... Gender Source
## reducedDimNames(1): umap
## mainExpName: NULL
## altExpNames(0):
# switch rownames to gene symbols
# Last sanity check, now enforced instead of commented out: rownames must be
# unique, so stop if any gene symbol occurs more than once.
stopifnot(!any(duplicated(rowData(sce_human_clean)$feature_name)))
# feature_name is stored as a factor; convert it explicitly so the rownames
# are the symbol labels and do not depend on implicit factor coercion.
rownames(sce_human_clean) <- as.character(rowData(sce_human_clean)$feature_name)
Let’s now convert it into a Seurat object.
# Convert the combined object to Seurat; it has a single assay ("counts"),
# hence data = NULL
seur_human <- Seurat::as.Seurat(x = sce_human_clean, data = NULL)
# Keep only the genes whose total over all cells is non-zero
seur_human <- seur_human[rowSums(seur_human) != 0, ]
# plot cell annotation
SCpubr::do_DimPlot(
  seur_human,
  group.by = "Anno_level_fig1",
  legend.position = "right"
)
# Relative path: resolved against the working directory set by setwd()
# at the top of the script
saveRDS(
  object = seur_human,
  file = "./data_github/park_dataset/park_seurat_human.rds"
)
The mouse h5ad file was downloaded from the latest version of
the Zenodo repository
of the paper (v1.0.2). Let’s import it as a
SingleCellExperiment object once again, and convert it to a
Seurat object.
# Read the mouse h5ad file and print a summary of the resulting object
sce_mouse <- zellkonverter::readH5AD(
  file = "~/Projects/HumanThymusProject/data_github/park_dataset/HTA08_v02_A04_Science_mouse_total.h5ad"
)
print(sce_mouse)
## class: SingleCellExperiment
## dim: 17996 36084
## metadata(0):
## assays(1): X
## rownames(17996): 0610005C13Rik 0610009B22Rik ... Zzz3 a
## rowData names(0):
## colnames(36084): FCAImmP8084852-AAACCCACATAACGGG
## FCAImmP8084852-AAACGAAAGCGGGTTA ...
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1
## colData names(4): cell.types stage age sample_ID
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out metadata: per-cell annotations (colData) of the mouse object
colData(sce_mouse)
## DataFrame with 36084 rows and 4 columns
## cell.types stage age
## <factor> <factor> <factor>
## FCAImmP8084852-AAACCCACATAACGGG DP(Q) postnatal 4W
## FCAImmP8084852-AAACGAAAGCGGGTTA DP(Q) postnatal 4W
## FCAImmP8084852-AAACGAACAGACGCTC αβT(entry) postnatal 4W
## FCAImmP8084852-AAACGAACAGAGATTA CD4+T postnatal 4W
## FCAImmP8084852-AAACGAAGTACCCACG DP(P) postnatal 4W
## ... ... ... ...
## AGATAGAGTACA-GSM2883197_E17_5_wholeThy_1 DN(Q) prenatal E17
## TTAAAATACTGA-GSM2883197_E17_5_wholeThy_1 DP(P) prenatal E17
## GTTTCTGCACTG-GSM2883197_E17_5_wholeThy_1 DN(Q) prenatal E17
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1 DP(P) prenatal E17
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1 DP(P) prenatal E17
## sample_ID
## <factor>
## FCAImmP8084852-AAACCCACATAACGGG MM_TH_4W_LI
## FCAImmP8084852-AAACGAAAGCGGGTTA MM_TH_4W_LI
## FCAImmP8084852-AAACGAACAGACGCTC MM_TH_4W_LI
## FCAImmP8084852-AAACGAACAGAGATTA MM_TH_4W_LI
## FCAImmP8084852-AAACGAAGTACCCACG MM_TH_4W_LI
## ... ...
## AGATAGAGTACA-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## TTAAAATACTGA-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## GTTTCTGCACTG-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
# Plot the UMAP embedding stored in the object ("X_umap"), coloured by
# the cell type annotation
scater::plotReducedDim(
  object = sce_mouse,
  dimred = "X_umap",
  colour_by = "cell.types"
)
# check out gene names (stored as rownames): none of the gene symbols are
# duplicated (ideal :)
table(duplicated(rownames(sce_mouse)), useNA = "ifany")
##
## FALSE
## 17996
# Rebuild the mouse object with the same layout as the human one: the first
# assay is stored as "counts" and the embedding is renamed to "umap"
# NOTE(review): confirm that the first assay ("X") holds raw counts.
sce_mouse_clean <- SingleCellExperiment(
  assays = list(counts = assay(sce_mouse)),
  rowData = rowData(sce_mouse),
  colData = colData(sce_mouse),
  reducedDims = list(umap = reducedDim(sce_mouse, "X_umap"))
)
print(sce_mouse_clean)
## class: SingleCellExperiment
## dim: 17996 36084
## metadata(0):
## assays(1): counts
## rownames(17996): 0610005C13Rik 0610009B22Rik ... Zzz3 a
## rowData names(0):
## colnames(36084): FCAImmP8084852-AAACCCACATAACGGG
## FCAImmP8084852-AAACGAAAGCGGGTTA ...
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1
## colData names(4): cell.types stage age sample_ID
## reducedDimNames(1): umap
## mainExpName: NULL
## altExpNames(0):
# Convert the mouse object to Seurat; it has a single assay ("counts"),
# hence data = NULL
seur_mouse <- Seurat::as.Seurat(x = sce_mouse_clean, data = NULL)
# Keep only the genes whose total over all cells is non-zero
seur_mouse <- seur_mouse[rowSums(seur_mouse) != 0, ]
# plot cell annotation
SCpubr::do_DimPlot(
  seur_mouse,
  group.by = "cell.types",
  legend.position = "right"
)
# Relative path: resolved against the working directory set by setwd()
# at the top of the script
saveRDS(
  object = seur_mouse,
  file = "./data_github/park_dataset/park_seurat_mouse.rds"
)
# print the R version and the attached/loaded package versions of this run
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics utils stats4 methods base
##
## other attached packages:
## [1] SCpubr_2.0.2 scater_1.22.0
## [3] scuttle_1.4.0 SingleCellExperiment_1.16.0
## [5] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [7] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [9] IRanges_2.28.0 S4Vectors_0.32.4
## [11] BiocGenerics_0.40.0 MatrixGenerics_1.6.0
## [13] matrixStats_1.0.0 SeuratObject_4.1.3
## [15] Seurat_4.3.0.1 forcats_1.0.0
## [17] stringr_1.5.0 dplyr_1.1.2
## [19] purrr_1.0.1 readr_2.1.4
## [21] tidyr_1.3.0 tibble_3.2.1
## [23] tidyverse_1.3.2 ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.2-1
## [3] reticulate_1.30 tidyselect_1.2.0
## [5] htmlwidgets_1.6.2 grid_4.1.3
## [7] BiocParallel_1.28.3 Rtsne_0.16
## [9] zellkonverter_1.4.0 munsell_0.5.0
## [11] ScaledMatrix_1.2.0 codetools_0.2-19
## [13] ica_1.0-3 future_1.33.0
## [15] miniUI_0.1.1.1 withr_2.5.0
## [17] spatstat.random_3.1-5 colorspace_2.1-0
## [19] progressr_0.13.0 filelock_1.0.2
## [21] highr_0.10 knitr_1.43
## [23] rstudioapi_0.14 ROCR_1.0-11
## [25] tensor_1.5 listenv_0.9.0
## [27] labeling_0.4.2 GenomeInfoDbData_1.2.7
## [29] polyclip_1.10-4 farver_2.1.1
## [31] rprojroot_2.0.3 basilisk_1.6.0
## [33] parallelly_1.36.0 vctrs_0.6.3
## [35] generics_0.1.3 xfun_0.39
## [37] timechange_0.2.0 R6_2.5.1
## [39] ggbeeswarm_0.7.2 rsvd_1.0.5
## [41] gridGraphics_0.5-1 bitops_1.0-7
## [43] spatstat.utils_3.0-3 cachem_1.0.8
## [45] DelayedArray_0.20.0 assertthat_0.2.1
## [47] promises_1.2.0.1 scales_1.2.1
## [49] googlesheets4_1.1.1 beeswarm_0.4.0
## [51] gtable_0.3.3 beachmat_2.10.0
## [53] globals_0.16.2 goftest_1.2-3
## [55] rlang_1.1.1 splines_4.1.3
## [57] lazyeval_0.2.2 gargle_1.5.1
## [59] spatstat.geom_3.2-1 broom_1.0.5
## [61] yaml_2.3.7 reshape2_1.4.4
## [63] abind_1.4-5 modelr_0.1.11
## [65] backports_1.4.1 httpuv_1.6.11
## [67] tools_4.1.3 ggplotify_0.1.1
## [69] ellipsis_0.3.2 jquerylib_0.1.4
## [71] RColorBrewer_1.1-3 ggridges_0.5.4
## [73] Rcpp_1.0.10 plyr_1.8.8
## [75] sparseMatrixStats_1.6.0 zlibbioc_1.40.0
## [77] RCurl_1.98-1.12 basilisk.utils_1.6.0
## [79] deldir_1.0-9 pbapply_1.7-2
## [81] viridis_0.6.3 cowplot_1.1.1
## [83] zoo_1.8-12 haven_2.5.3
## [85] ggrepel_0.9.3 cluster_2.1.4
## [87] here_1.0.1 fs_1.6.2
## [89] magrittr_2.0.3 data.table_1.14.8
## [91] scattermore_1.2 lmtest_0.9-40
## [93] reprex_2.0.2 RANN_2.6.1
## [95] googledrive_2.1.1 fitdistrplus_1.1-11
## [97] hms_1.1.3 patchwork_1.1.2
## [99] mime_0.12 evaluate_0.21
## [101] xtable_1.8-4 readxl_1.4.2
## [103] gridExtra_2.3 compiler_4.1.3
## [105] KernSmooth_2.23-21 crayon_1.5.2
## [107] htmltools_0.5.5 later_1.3.1
## [109] tzdb_0.4.0 lubridate_1.9.2
## [111] DBI_1.1.3 dbplyr_2.3.2
## [113] MASS_7.3-60 Matrix_1.5-4.1
## [115] cli_3.6.3 datasets_4.1.3
## [117] grDevices_4.1.3 parallel_4.1.3
## [119] igraph_1.5.0 pkgconfig_2.0.3
## [121] dir.expiry_1.2.0 sp_2.0-0
## [123] plotly_4.10.2 spatstat.sparse_3.0-2
## [125] xml2_1.3.4 vipor_0.4.5
## [127] bslib_0.5.0 XVector_0.34.0
## [129] rvest_1.0.3 yulab.utils_0.0.6
## [131] digest_0.6.32 sctransform_0.3.5
## [133] RcppAnnoy_0.0.21 spatstat.data_3.0-1
## [135] rmarkdown_2.23 cellranger_1.1.0
## [137] leiden_0.4.3 uwot_0.1.16
## [139] DelayedMatrixStats_1.16.0 shiny_1.7.4
## [141] lifecycle_1.0.3 nlme_3.1-162
## [143] jsonlite_1.8.7 BiocNeighbors_1.12.0
## [145] viridisLite_0.4.2 fansi_1.0.4
## [147] pillar_1.9.0 lattice_0.21-8
## [149] fastmap_1.1.1 httr_1.4.6
## [151] survival_3.5-5 glue_1.6.2
## [153] png_0.1-8 stringi_1.7.12
## [155] sass_0.4.6 BiocSingular_1.10.0
## [157] irlba_2.3.5.1 future.apply_1.11.0